{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.linear_model import LogisticRegression\n",
    "from sklearn.linear_model import SGDClassifier\n",
    "from sklearn.model_selection import train_test_split\n",
    "import pickle\n",
    "import numpy as np\n",
    "from sklearn import metrics"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pickle\n",
    "\n",
    "with open('features.pkl', 'rb') as fopen:\n",
    "    dataset = pickle.load(fopen)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "dict_keys(['positive', 'negative'])"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "dataset.keys()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "with open('p.pkl', 'rb') as fopen:\n",
    "    p = pickle.load(fopen)\n",
    "    \n",
    "with open('n.pkl', 'rb') as fopen:\n",
    "    n = pickle.load(fopen)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "X = dataset['positive'] + p + dataset['negative'] + n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "Y = [1] * len(dataset['positive']) + [1] * len(p) + [0] * len(dataset['negative']) + [0] * len(n)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "train_X, test_X, train_Y, test_Y = train_test_split(X, Y, test_size = 0.2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/husein/.local/lib/python3.6/site-packages/sklearn/linear_model/logistic.py:432: FutureWarning: Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.\n",
      "  FutureWarning)\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 2min 45s, sys: 2min 21s, total: 5min 6s\n",
      "Wall time: 22.4 s\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "logistic = LogisticRegression().fit(train_X, train_Y)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "predicted = logistic.predict(test_X)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "              precision    recall  f1-score   support\n",
      "\n",
      "    negative   0.992008  0.993000  0.992504      2000\n",
      "    positive   0.993086  0.992107  0.992596      2027\n",
      "\n",
      "    accuracy                       0.992550      4027\n",
      "   macro avg   0.992547  0.992553  0.992550      4027\n",
      "weighted avg   0.992551  0.992550  0.992550      4027\n",
      "\n"
     ]
    }
   ],
   "source": [
    "print(\n",
    "    metrics.classification_report(\n",
    "        test_Y, predicted, target_names = ['negative', 'positive'], digits = 6\n",
    "    )\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "with open('logistic-detect-jawi.pkl', 'wb') as fopen:\n",
    "    pickle.dump(logistic, fopen)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 13.2 s, sys: 3.06 s, total: 16.3 s\n",
      "Wall time: 10.6 s\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "\n",
    "sgd = SGDClassifier(loss='modified_huber',alpha=0.001).fit(train_X, train_Y)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "              precision    recall  f1-score   support\n",
      "\n",
      "    negative   0.984213  0.997500  0.990812      2000\n",
      "    positive   0.997500  0.984213  0.990812      2027\n",
      "\n",
      "    accuracy                       0.990812      4027\n",
      "   macro avg   0.990857  0.990857  0.990812      4027\n",
      "weighted avg   0.990901  0.990812  0.990812      4027\n",
      "\n"
     ]
    }
   ],
   "source": [
    "predicted = sgd.predict(test_X)\n",
    "\n",
    "print(\n",
    "    metrics.classification_report(\n",
    "        test_Y, predicted, target_names = ['negative', 'positive'], digits = 6\n",
    "    )\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [],
   "source": [
    "with open('sgd-detect-jawi.pkl', 'wb') as fopen:\n",
    "    pickle.dump(sgd, fopen)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.8"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
